Load libraries

library(tidyr)
library(ggpubr)
library(ggsci)
library(ComplexHeatmap)
library(GeneOverlap)
library(data.table)
library(kableExtra)
library(forcats)
library(mgsub)
library(DESeq2)
library(pheatmap)
library(ggplot2)
library(ggrepel)
library(RColorBrewer)
library(limma)
library(edgeR)
library(enrichR)
library(gridExtra)
library(stringr)
library(dplyr)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
library("org.Hs.eg.db", character.only = TRUE)
library(factoextra)
library(rafalib)
library(tibble)
library(devtools)
library(clusterProfiler)
library(enrichplot)
library(ggprism)
library(msigdbr)

Set up the directories

# Define base directory
data_dir <- "../data/"

# Define gene counts directory
count_dir <- file.path(data_dir, "processed_data_nfcore/batch_1-2/salmon/")
## Gene counts file
count_file <- file.path(count_dir, "salmon.merged.gene_counts.tsv")

# Define metadata directory
sample_metadata_dir <- file.path(data_dir, "metadata/")

## Gene metadata file
gene_metadata_file <- file.path(sample_metadata_dir, "2023-11-222_mart_export_gene_ids_hsa.txt")
## Sample metadata file
sample_metadata_file <- file.path(sample_metadata_dir, "sample_metadata_1.csv")
## Patient metadata file
patient_metadata_file <- file.path(sample_metadata_dir, "patient_metadata.csv")

Load raw counts

Wrangle raw counts.

# Load raw counts
counts_raw <- data.table::fread(count_file,
  header = TRUE, sep = "\t"
) %>%
  as.data.frame() %>%
  select(-gene_id)

colnames(counts_raw) <- sub("_L001|_L002", "", colnames(counts_raw))
# Filter for only protein coding genes or mitochondrial/ribosomic genes
gene_id_metadata <- read.csv(gene_metadata_file) %>% filter(Gene.type %in% c("Mt_rRNA", "protein_coding", "rRNA"))
# Aggregate and sum up genes with same gene symbol (sum up isoform of genes)
counts_raw <- counts_raw[counts_raw$gene_name %in% gene_id_metadata$Gene.name, ]

counts_raw <- aggregate(counts_raw[, -1],
  list(gene_name = counts_raw[, 1]),
  FUN = sum
)

# Change row names to gene names
rownames(counts_raw) <- counts_raw$gene_name
counts_raw <- dplyr::select(counts_raw, -c(gene_name))

Load metadata

Match with raw counts.

# Load sample metadata
sampleTable <- read.table(sample_metadata_file,
  sep = ",", header = TRUE
) %>%
  dplyr::select(-c(age, sex))

# Load patient metadata
patient_metadata <- read.table(patient_metadata_file,
  sep = ",", header = TRUE
)

# Join sample and patient metadata
sampleTable <- sampleTable %>% left_join(patient_metadata, by = "subject_ID")

# Change visits to dose and time point...
sampleTable <- sampleTable %>%
  mutate(
    dose = as.factor(plyr::mapvalues(visit,
      from = c(1, 2, 4, 5, 10, 11, 12),
      to   = c("0_dose1", "24_dose1", "0_dose2", "24_dose2", "0_dose3", "24_dose3", "48_dose3")
    )),
    group = as.factor(group)
  ) %>%
  # ...(sex) 0 and 1 to male and female...
  mutate(
    sex = as.factor(plyr::mapvalues(sex,
      from = c(0, 1),
      to   = c("male", "female")
    )),
    group = as.factor(sex)
  ) %>%
  # ...(prior covid) 0 and 1 to neg and pos
  mutate(
    prior_covid19 = as.factor(plyr::mapvalues(prior_covid19,
      from = c(0, 1),
      to   = c("neg", "pos")
    )),
    group = as.factor(prior_covid19)
  )

# Remove 24dose_2 from subject 21 (missing 0dose_2)
sampleTable <- subset(sampleTable, sample_ID != "P22761_1037_S37")

# Remove one sample from subject 7 (sample replicate)
sampleTable <- subset(sampleTable, sample_ID != "P26208_1060_S62")

# Remove corresponding samples from count table
counts_raw <- subset(counts_raw, select = -c(P26208_1060_S62, P22761_1037_S37))

# Match count data with sample table
colnames(counts_raw) <- sampleTable$sample_ID
counts_raw <- counts_raw[, pmatch(sampleTable$sample_ID, colnames(counts_raw))]
all(rownames(sampleTable$sample_ID) == colnames(counts_raw))

Basic Quality Control

Inspect raw data

Plotting raw counts for first visual inspection. The boxplot shows median values of zero across all samples. In the histogram there is a huge peak of zero counts. Both indicate that the data set would benefit from a low count filtering.

# Visualize distribution of raw counts with boxplot and density plot
boxplot(log2(as.matrix(counts_raw) + 1),
  ylab = expression("Log"[2] ~ "Read counts"),
  las = 2,
  main = "Raw data"
)

hist(log2(as.matrix(counts_raw) + 1),
  ylab = "",
  las = 2,
  main = "Raw data"
)

Filtering

Plot number of genes detected across samples. All samples are more or less close to average so we don’t need to discard any samples.

barplot(colSums(counts_raw > 3),
  ylab = "Number of detected genes",
  las = 2
) +
  abline(h = median(colSums(counts_raw > 3)))

Removing reads with the log2 of the counts per million (cpm) lower than 1 (= 45896 genes). Plot genes detection rate across samples, comparing raw and filtered counts.

meanLog2CPM <- rowMeans(log2(cpm(counts_raw) + 1))
counts_filtered <- counts_raw[meanLog2CPM > 1, ]

hist(rowSums(counts_raw > 3), main = title("Raw Counts"))

hist(rowSums(counts_filtered > 3), main = title("Filtered Counts"))

Plot distribution of the filtered counts

boxplot(log2(as.matrix(counts_filtered) + 1),
  ylab = expression("Log"[2] ~ "Read counts"),
  las = 2,
  main = "Filtered data"
)

hist(log2(as.matrix(counts_filtered) + 1),
  ylab = "",
  las = 2,
  main = "Filtered data"
)

Create DESeq object

Generating three separate DESeq objects with different design parameters to allow for more comparisons. Controlling for age and sex.

# Remove vaccine/sars-cov-2 related genes
counts_filtered <- counts_filtered[!grepl("vaccine|spike|sars_cov", rownames(counts_filtered)), ]

# Prepare for DESeq by creating new variables and factorizing
sampleTable <- sampleTable %>%
  mutate(
    conditions = paste0(sampleTable$group, "_", sampleTable$dose),
    time = gsub(".{6}$", "", conditions),
    conditions = factor(conditions),
    time = factor(time),
    sex = factor(sex),
    dose = factor(dose, levels = c("0_dose1", "24_dose1", "0_dose2", "24_dose2", "0_dose3", "24_dose3", "48_dose3")),
    group = factor(group),
    batch = factor(batch)
  )

# Create DESeq objects with different designs; comparing against conditions, time and dose
dds1 <- DESeqDataSetFromMatrix(
  countData = round(counts_filtered),
  colData = as.data.frame(sampleTable),
  design = ~ sex + age + batch + conditions
)

dds2 <- DESeqDataSetFromMatrix(
  countData = round(counts_filtered),
  colData = as.data.frame(sampleTable),
  design = ~ sex + age + batch + time
)

dds3 <- DESeqDataSetFromMatrix(
  countData = round(counts_filtered),
  colData = as.data.frame(sampleTable),
  design = ~ sex + age + batch + dose
)

dds4 <- DESeqDataSetFromMatrix(
  countData = round(counts_filtered),
  colData = as.data.frame(sampleTable),
  design = ~ sex + age + batch + dose + group + dose:group
)

VST normalization

Normalize with variance stabilizing transformation Plot distribution of data after normalization

# Normalize with variance stabilizing transformation for later PCA and heatmap
counts_vst_normalized <- vst(dds1, blind = TRUE)

vst_matrix <- assay(counts_vst_normalized)
hist(vst_matrix)

boxplot(vst_matrix,
  ylab = expression("Log"[2] ~ "Read counts"),
  las = 2,
  main = "VST"
)

Heatmap

Hierarchical clustering for initial inspection of sample similarity and gene expression patterns.

# Sample heatmap
sampleDist <- cor(vst_matrix, method = "spearman")

Metadata <- data.frame(sampleTable$group, sampleTable$dose)
names(Metadata) <- c("Group", "Dose")
rownames(Metadata) <- sampleTable$sample_ID

# Plot heatmap
colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(255)

Heatmap <- pheatmap(sampleDist,
  color = colors,
  clustering_distance_rows = as.dist(1 - sampleDist),
  clustering_distance_cols = as.dist(1 - sampleDist),
  show_rownames = FALSE,
  show_colnames = FALSE,
  clustering_method = "ward.D2",
  annotation_col = Metadata
)

print(Heatmap)

Principal Component Analysis

Dimensionality reduction for evaluating outliers and global sample clusters for quality control and primary exploratory analysis.

# PCA plot
pcaRes <- prcomp(t(assay(counts_vst_normalized)))
varExp <- round(summary(pcaRes)[[1]], digits = 2)

pcaDF <- data.frame(
  PC1 = pcaRes$x[, 1],
  PC2 = pcaRes$x[, 2],
  Group = sampleTable$group,
  Sample = sampleTable$sample_ID,
  Dose = sampleTable$dose
)

pcaPlot <- ggplot(pcaDF, mapping = aes(x = PC1, y = PC2, color = Group, label = Sample)) +
  geom_point(aes(shape = Group), size = 3) +
  labs(
    x = paste0("PC1 (", varExp[1], " %)"),
    y = paste0("PC2 (", varExp[2], " %)")
  ) +
  theme(
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 10)
  ) +
  scale_color_manual(values = brewer.pal(3, "Accent")) +
  cowplot::theme_cowplot()

print(pcaPlot)

# Scree plot, showing contribution of principal components in descending order
pca.var <- pcaRes$sdev^2
pca.var.per <- round(pca.var / sum(pca.var) * 100, 1)

barplot(pca.var.per,
  main = "Scree Plot",
  xlab = "Principal Component",
  ylab = "Percent Variation"
)

# Explore which variables contribute the most for each PC
var <- get_pca_var(pcaRes)
var$contrib %>%
  as.data.frame() %>%
  arrange(desc(Dim.1)) %>%
  head() %>%
  kableExtra::kable() %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7 Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14 Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21 Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28 Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35 Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42 Dim.43 Dim.44 Dim.45 Dim.46 Dim.47 Dim.48 Dim.49 Dim.50 Dim.51 Dim.52 Dim.53 Dim.54 Dim.55 Dim.56 Dim.57 Dim.58 Dim.59 Dim.60 Dim.61 Dim.62 Dim.63 Dim.64 Dim.65 Dim.66 Dim.67 Dim.68 Dim.69 Dim.70 Dim.71 Dim.72 Dim.73 Dim.74 Dim.75 Dim.76 Dim.77 Dim.78 Dim.79 Dim.80 Dim.81 Dim.82 Dim.83 Dim.84 Dim.85 Dim.86 Dim.87 Dim.88 Dim.89 Dim.90 Dim.91 Dim.92 Dim.93 Dim.94 Dim.95 Dim.96 Dim.97
ANKRD22 0.3907566 0.3707759 0.1763453 0.0697718 0.0223635 0.0447039 0.0114715 0.0781899 0.0020209 0.0021541 0.0015474 0.0081783 0.0088655 0.0381066 0.0000211 0.0425199 0.0027675 0.0080813 0.0002822 0.0596314 0.0141149 0.0612235 0.0000446 0.0468783 0.0127597 0.0627942 0.0002991 0.0767245 0.0836628 0.1805330 0.0295422 0.0209039 0.0430086 0.0242306 0.0472737 0.0096422 0.0526057 0.0641638 0.0016640 0.0024777 0.0088926 0.0301645 0.0416714 0.0412024 0.0189615 0.1289343 0.0132374 0.0025781 0.0326548 0.0011627 0.0377771 0.0854475 0.0100276 0.0036267 0.0145649 0.0882436 0.0000000 0.0568559 0.0114556 0.0219706 0.0291159 0.0022544 0.0357510 0.0288985 0.0010657 0.0129628 0.0032932 0.0031827 0.0101765 0.0117289 0.0204692 0.0416297 0.0144796 0.0990382 0.0173874 0.0007648 0.0102040 0.0397444 0.0380011 0.0002510 0.0067175 0.0000023 0.0088305 0.0259774 0.0017850 0.0088491 0.0001625 0.0032289 0.0366255 0.0028255 0.0087858 0.0084940 0.0878354 0.0104456 0.1651719 0.0100752 0.0075298
CD274 0.3219816 0.1684654 0.0509356 0.0072978 0.0147743 0.0267557 0.0013991 0.0611167 0.0209512 0.0000531 0.0025820 0.0116135 0.0033795 0.0076675 0.0085935 0.0130656 0.0051637 0.0138596 0.0050288 0.0329616 0.0576326 0.0324778 0.0071727 0.0047896 0.0199693 0.0022324 0.0371298 0.0040813 0.0291641 0.0814194 0.0115577 0.0365728 0.0054533 0.0128201 0.0218089 0.0003426 0.0487911 0.0001828 0.1057181 0.0012723 0.0000004 0.0132958 0.0055776 0.0816477 0.0263703 0.0006540 0.0009068 0.0057047 0.0008221 0.0108240 0.0179337 0.0000878 0.0193121 0.0037728 0.0072008 0.0097795 0.0330921 0.0018139 0.0003408 0.0067511 0.0240466 0.0003445 0.0016070 0.0000475 0.0001970 0.0463836 0.0521358 0.0118571 0.0123785 0.0000199 0.0016967 0.0058354 0.0044445 0.0153624 0.0051562 0.0024172 0.0000045 0.0009116 0.0019972 0.0109891 0.0280394 0.0064917 0.0000974 0.0004984 0.0229813 0.0124543 0.0035928 0.0010370 0.0211658 0.0124155 0.0263697 0.0094679 0.0045309 0.0009275 0.0216106 0.0039971 0.0000234
RSAD2 0.2947192 0.2049546 0.4974586 0.0318281 0.0073626 0.0210826 0.1010576 0.1187275 0.3007710 0.0244298 0.0190231 0.0541176 0.1354022 0.1027793 0.0013662 0.0521347 0.3049723 0.1612313 0.2140448 0.0875979 0.2062015 0.0006657 0.0195654 0.0034210 0.0896312 0.0676904 0.3459785 0.0596110 0.0071078 0.1060516 0.0247890 0.0498140 0.0573276 0.0627742 0.1763421 0.0151587 0.0992023 0.2075634 0.0092668 0.0024749 0.0908292 0.0034130 0.0096694 0.0851753 0.0208146 0.2071238 0.0231645 0.0032822 0.0006856 0.0008489 0.0002554 0.0006178 0.0037641 0.0874743 0.0213023 0.0303004 0.0560511 0.0066217 0.0010099 0.0583976 0.0051901 0.0005496 0.0134009 0.0022981 0.0466030 0.0241285 0.0636444 0.0266754 0.0555331 0.0322514 0.0037240 0.1087341 0.0033155 0.0144053 0.0351500 0.0230706 0.1349499 0.0232548 0.0000171 0.0012495 0.0225385 0.0487633 0.0329394 0.0104405 0.0000561 0.0597059 0.0241697 0.0737168 0.0018283 0.0222574 0.0007056 0.0111165 0.0012408 0.0163556 0.0005637 0.0001130 0.0008663
GBP5 0.2775365 0.2180647 0.1971194 0.0086061 0.0183571 0.0078968 0.0053304 0.0279666 0.0727507 0.0038873 0.0049237 0.0522437 0.0013829 0.0003160 0.0000344 0.0333063 0.0164076 0.0083372 0.0022120 0.0181464 0.0186165 0.0805872 0.0097645 0.0662536 0.0071108 0.0014847 0.0011171 0.0274428 0.0232320 0.0657584 0.0634762 0.0193610 0.0094466 0.0309201 0.0724822 0.0000920 0.0808818 0.0093290 0.0008618 0.0260111 0.0137608 0.0245504 0.0085836 0.0415994 0.0179343 0.0045942 0.0109688 0.0088312 0.0010925 0.0314393 0.0011057 0.0542982 0.0579596 0.0756799 0.0092425 0.0511590 0.0000682 0.0555034 0.0236939 0.0009586 0.0094485 0.0001229 0.0169044 0.0016760 0.0020045 0.0009688 0.0014923 0.0013279 0.0190562 0.0053548 0.0080992 0.0026682 0.0068683 0.0016553 0.0042436 0.0114684 0.0087335 0.0040431 0.0044237 0.0032285 0.0028552 0.0031691 0.0018202 0.0000005 0.0020222 0.0005591 0.0027866 0.0000328 0.0019157 0.0003959 0.0006496 0.0003764 0.0008036 0.0035706 0.0004326 0.0014613 0.0217768
IDO1 0.2695485 0.3902585 0.3248260 0.0000503 0.0519635 0.1134049 0.0029245 0.1120883 0.1451199 0.0130168 0.0051264 0.0583079 0.0002519 0.0827080 0.1974039 0.0133183 0.1286804 0.0046578 0.2177188 0.2242944 0.0227812 0.0020528 0.1392108 0.0628725 0.0136930 0.0720865 0.0187798 0.0345214 0.0001039 0.0100099 0.0079035 0.0055624 0.0013207 0.0219797 0.0235757 0.3588397 0.0533840 0.0765474 0.0590732 0.1075406 0.0999469 0.0085405 0.1487057 0.0660700 0.0000173 0.0021043 0.2139270 0.0396101 0.3171282 0.0025444 0.0486026 0.0092592 0.0009588 0.0905524 0.0000463 0.0283794 0.0035875 0.0007088 0.1144612 0.0115228 0.0750467 0.0412158 0.0003541 0.0217175 0.0016044 0.0100588 0.3301425 0.0028503 0.0020741 0.0060669 0.0281377 0.0035831 0.0793929 0.0050499 0.0124676 0.1588564 0.0214707 0.0216549 0.0480618 0.0003953 0.0001289 0.0000622 0.0857592 0.1182902 0.0056783 0.0233494 0.1111740 0.0261812 0.4302242 0.0971284 0.0191912 0.0041599 0.0525118 0.0103630 0.0006946 0.0497602 0.0005033
IFI44L 0.2559941 0.0967018 0.4042610 0.0589702 0.0129810 0.0337118 0.0405968 0.1877308 0.2295361 0.0109766 0.0037365 0.0305678 0.1265156 0.0877680 0.0001247 0.0229197 0.0788515 0.0973684 0.1814140 0.0630997 0.0746943 0.0132093 0.0005669 0.0030227 0.0385829 0.0697889 0.1785369 0.0849351 0.0743779 0.0034113 0.0234150 0.0566579 0.0104426 0.0518468 0.0559481 0.0033853 0.0070485 0.2408779 0.0142019 0.0109448 0.0550837 0.0000410 0.0045375 0.2019914 0.0003672 0.0917498 0.0227366 0.0228853 0.0080531 0.0020156 0.0008918 0.0005016 0.0062675 0.0450977 0.0529248 0.0159847 0.0241092 0.0001956 0.0291446 0.0106654 0.0127238 0.0005716 0.0040855 0.0120674 0.0085962 0.0000152 0.0261820 0.0015791 0.0224153 0.0000002 0.0025845 0.0304191 0.0050129 0.0102226 0.0628257 0.0202575 0.1363499 0.0000662 0.0113016 0.0076377 0.0334686 0.0000350 0.0011456 0.0262246 0.0000334 0.0702977 0.0071828 0.0351828 0.0078437 0.0024033 0.0002701 0.0025880 0.0036008 0.0000067 0.0788306 0.0472378 0.0017065

Differential Gene Expression Analysis

Differential Gene Expression Analysis using DESeq2. Comparing parameters; dose, time point, prior infection status.

# Run the DESeq2 analysis
dds1 <- DESeq(dds1)
dds2 <- DESeq(dds2)
dds3 <- DESeq(dds3)
ddsTC <- DESeq(dds4, test = "LRT", reduced = ~ dose + group)

# Contrast
resultsNames(dds1)
resultsNames(dds2)
resultsNames(dds3)
resultsNames(ddsTC)

# Create function for makings comparisons with default being conditions and dds1
compare_DEGs <- function(variable1, variable2, condition = "conditions", dds = dds1) {
  as.data.frame(results(dds, contrast = c(condition, variable1, variable2)))
}

# Baselines DEGs
base_pos_neg_d1 <- compare_DEGs("pos_0_dose1", "neg_0_dose1")
base_pos_neg_d2 <- compare_DEGs("pos_0_dose2", "neg_0_dose2")
base_pos_neg_d3 <- compare_DEGs("pos_0_dose3", "neg_0_dose3")

base_pos_d1_d2 <- compare_DEGs("pos_0_dose2", "pos_0_dose1")
base_pos_d1_d3 <- compare_DEGs("pos_0_dose3", "pos_0_dose1")
base_pos_d2_d3 <- compare_DEGs("pos_0_dose2", "pos_0_dose3")

base_neg_d1_d2 <- compare_DEGs("neg_0_dose2", "neg_0_dose1")
base_neg_d1_d3 <- compare_DEGs("neg_0_dose3", "neg_0_dose1")
base_neg_d2_d3 <- compare_DEGs("neg_0_dose2", "neg_0_dose3")

# 0-24h and 0-48 (3rd dose) DEGs
neg_d1 <- compare_DEGs("neg_24_dose1", "neg_0_dose1")
neg_d2 <- compare_DEGs("neg_24_dose2", "neg_0_dose2")
neg_d3_24 <- compare_DEGs("neg_24_dose3", "neg_0_dose3")
neg_d3_48 <- compare_DEGs("neg_48_dose3", "neg_0_dose3")

pos_d1 <- compare_DEGs("pos_24_dose1", "pos_0_dose1")
pos_d2 <- compare_DEGs("pos_24_dose2", "pos_0_dose2")
pos_d3_24 <- compare_DEGs("pos_24_dose3", "pos_0_dose3")
pos_d3_48 <- compare_DEGs("pos_48_dose3", "pos_0_dose3")

# Results dds2
pos <- compare_DEGs("pos_24", "pos_0", condition = "time", dds = dds2)
neg <- compare_DEGs("neg_24", "neg_0", condition = "time", dds = dds2)

# Results dds3
d1 <- compare_DEGs("24_dose1", "0_dose1", condition = "dose", dds = dds3)
d2 <- compare_DEGs("24_dose2", "0_dose2", condition = "dose", dds = dds3)
d3_24 <- compare_DEGs("24_dose3", "0_dose3", condition = "dose", dds = dds3)
d3_48 <- compare_DEGs("48_dose3", "0_dose3", condition = "dose", dds = dds3)

# Results dds4 intercept
resTC <- results(ddsTC)

Figure 2 A, S5 A-C Volcano plots

Volcano plot, filtering for significant values with False Discovery Rate < 0.05 and log fold change greater than 1.

results_list <- list(
  base_pos_neg_d1,
  base_pos_neg_d2,
  base_pos_neg_d3,
  base_pos_d1_d2,
  base_pos_d1_d3,
  base_pos_d2_d3,
  base_neg_d1_d2,
  base_neg_d1_d3,
  base_neg_d2_d3,
  neg_d1,
  neg_d2,
  neg_d3_24,
  neg_d3_48,
  pos_d1,
  pos_d2,
  pos_d3_24,
  pos_d3_48,
  pos,
  neg,
  d1,
  d2,
  d3_24,
  d3_48
)

names(results_list) <- c(
  "Baseline Dose 1 Conv vs Naïve",
  "Baseline Dose 2 Conv vs Naïve",
  "Baseline Dose 3 Conv vs Naïve",
  "Baseline Conv Dose 1 and 2",
  "Baseline Conv Dose 1 and 3",
  "Baseline Conv Dose 2 and 3",
  "Baseline Naïve Dose 1 and 2",
  "Baseline Naïve Dose 1 and 3",
  "Baseline Naïve Dose 2 and 3",
  "Naïve Dose 1 0-24 hours",
  "Naïve Dose 2 0-24 hours",
  "Naïve Dose 3 0-24 hours",
  "Naïve Dose 3 0-48 hours",
  "Conv Dose 1 0-24 hours",
  "Conv Dose 2 0-24 hours",
  "Conv Dose 3 0-24 hours",
  "Conv Dose 3 0-48 hours",
  "Conv 0-24 hours",
  "Naïve 0-24 hours",
  "Dose 1 0-24 hours",
  "Dose 2 0-24 hours",
  "Dose 3 0-24 hours",
  "Dose 3 0-48 hours"
)

results_list_plot <- lapply(results_list, function(x) {
  x <- x %>% mutate(
    test_padj = case_when(
      padj < 0.05 & log2FoldChange >= 1 ~ "Up regulated",
      padj < 0.05 & log2FoldChange <= -1 ~ "Down regulated",
      TRUE ~ "Not significant"
    ),
    test_pvalue = case_when(
      pvalue < 0.05 & log2FoldChange >= 1 ~ "Up regulated",
      pvalue < 0.05 & log2FoldChange <= -1 ~ "Down regulated",
      TRUE ~ "Not significant"
    ),
  )
})

# function to plot volcano plots
volcano_plot <- function(x, pvalue_selection = c("padj", "pvalue")) {
  if (pvalue_selection == "padj") {
    col_name_pval <- "test_padj"
  } else if (pvalue_selection == "pvalue") {
    col_name_pval <- "test_pvalue"
  } else {
    stop("Please change the pvalue_selection argument value. It should either `padj` or `pvalue`.")
  }
  subsetted <- subset(results_list_plot[[x]] %>% tibble::rownames_to_column("gene_symbol"), abs(log2FoldChange) >= 1 & get(pvalue_selection) < 0.05)
  label_list <- c("RSAD2", "IFIT1", "CXCL10", "CMPK2", "GBP1P1", "IFI44L", "BATF2", "SIGLEC1", "IFI44", "NDUFS4", "MX1", "CXCL9", "FCGR1A", "FCGR1B", "CCL4")
  label <- subsetted %>% filter(gene_symbol %in% label_list)

  # count up regulated DEGs
  deg_number_up <- subset(
    results_list_plot[[x]] %>%
      tibble::rownames_to_column("gene_symbol"),
    log2FoldChange >= 1 & get(pvalue_selection) < 0.05
  ) %>%
    nrow()
  # count down regulated DEGs
  deg_number_down <- subset(
    results_list_plot[[x]] %>%
      tibble::rownames_to_column("gene_symbol"),
    log2FoldChange <= -1 & get(pvalue_selection) < 0.05
  ) %>%
    nrow()


  results_list_plot[[x]] %>%
    ggplot(aes(x = log2FoldChange, y = -log10(get(pvalue_selection)))) +
    geom_point(aes(color = get(col_name_pval)), size = 3, alpha = 0.3) +
    scale_color_manual(values = c("Down regulated" = "blue", "Not significant" = "grey80", "Up regulated" = "red")) +
    xlab(expression("Fold Change (Log"[2] * ")")) +
    ylab(expression(paste0("-Log"[10] * "(", pvalue_selection, ")"))) +
    labs(x = NULL, y = NULL) +
    geom_vline(xintercept = c(-1), linetype = "dotted", size = 1) +
    geom_vline(xintercept = c(1), linetype = "dotted", size = 1) +
    geom_hline(yintercept = -log10(0.05), linetype = "dotted", size = 1) +
    geom_label_repel(data = label, aes(log2FoldChange, -log10(get(pvalue_selection)), label = gene_symbol)) +
    xlim(-2, 6) +
    ylim(0, 20) +
    ggtitle(x) +
    annotate(geom = "text", colour = "red", size = 10, x = 4, y = 20, hjust = 0, label = paste0(deg_number_up)) +
    annotate(geom = "text", colour = "blue", size = 10, x = -2, y = 20, hjust = 0, label = paste0(deg_number_down)) +
    ggprism::theme_prism() +
    theme(axis.ticks = element_line(size = .5), legend.position = "none")
}


volcano_plots <- lapply(names(results_list_plot), volcano_plot, pvalue_selection = "padj")
names(volcano_plots) <- names(results_list_plot)
volcano_plots

Gene Set Enrichment Analysis

GSEA looks for patterns of enriched genes which are related to each other. Compared with over-representation analysis this does not rely solely on DEGs. Therefore, this method is advantageous when the differences in gene expression is smaller.

Database used for enrichment analysis, “Reactome”. Also testing the Blood Transcriptome modules presented in Li et al. 2014.

Prepare Input

gmt <- read.gmt("../data/blood_transcription_module/BTM_for_GSEA_20131008.gmt")
gmt.reactome <- read.gmt("../data/metabolic_transcriptionmodule/MSigDB_reactome.gmt")
process_to_gsea <- function(x) {
  original_gene_list <- x$log2FoldChange
  names(original_gene_list) <- rownames(x)
  gene_list <- na.omit(original_gene_list)
  gene_list <- sort(gene_list, decreasing = TRUE)
}

ls_processed <- lapply(results_list, process_to_gsea)

Blood Transcription Modules

list_gse <- lapply(ls_processed, function(x) {
  GSEA(
    geneList = x,
    TERM2GENE = gmt,
    minGSSize = 3,
    maxGSSize = 800,
    nPermSimple = 10000,
    pvalueCutoff = 0.05,
    verbose = TRUE,
    pAdjustMethod = "fdr"
  )
})

MSigDb Reactome

list_gse_reactome <- lapply(ls_processed, function(x) {
  GSEA(
    geneList = x,
    TERM2GENE = gmt.reactome,
    minGSSize = 3,
    maxGSSize = 800,
    nPermSimple = 10000,
    pvalueCutoff = 0.05,
    verbose = TRUE,
    pAdjustMethod = "fdr"
  )
})

Figure 4 B Heatmap Blood Transcriptome Modules

Gene set enrichment analysis based on fold change ranking using the blood transcriptome modules presented in Li et al. 2014. Cut-off; FDR-adjusted p-value < 0.05. N = 15

my_palette <- colorRampPalette(c("#265891", "white", "#B80F20"))(n = 100)
sel_list_gse <- list_gse[c(
  "Naïve Dose 1 0-24 hours",
  "Naïve Dose 2 0-24 hours",
  "Naïve Dose 3 0-24 hours",
  "Naïve Dose 3 0-48 hours",
  "Conv Dose 1 0-24 hours",
  "Conv Dose 2 0-24 hours",
  "Conv Dose 3 0-24 hours",
  "Conv Dose 3 0-48 hours"
)]

combined_gsea <- data.table::rbindlist(lapply(sel_list_gse, as.data.frame), idcol = TRUE) %>%
  group_by(.id) %>%
  arrange(desc(NES)) %>%
  ungroup()

to_heatmap <- combined_gsea %>%
  select(.id, ID, NES) %>%
  filter(NES > 2 | NES < -2, !grepl("TBA", ID)) %>%
  tidyr::pivot_wider(values_from = NES, names_from = ID) %>%
  mutate(across(where(anyNA), ~ tidyr::replace_na(., 0))) %>%
  t()
colnames(to_heatmap) <- to_heatmap[1, ]
to_heatmap <- to_heatmap[-1, ]

col_order <- c(
  "Conv Dose 1 0-24 hours",
  "Naïve Dose 1 0-24 hours",
  "Conv Dose 2 0-24 hours",
  "Naïve Dose 2 0-24 hours",
  "Conv Dose 3 0-24 hours",
  "Naïve Dose 3 0-24 hours",
  "Conv Dose 3 0-48 hours",
  "Naïve Dose 3 0-48 hours"
)

to_heatmap <- to_heatmap[, col_order]

BTM_heatmap <- matrix(as.numeric(to_heatmap), ncol = ncol(to_heatmap)) %>%
  pheatmap::pheatmap(
    width = 6,
    height = 20,
    cellwidth = 20,
    cellheight = 5,
    cluster_rows = TRUE,
    cluster_cols = FALSE,
    clustering_method = "complete",
    angle_col = 45,
    border_color = "black",
    cutree_cols = 4,
    color = my_palette,
    labels_col = colnames(to_heatmap),
    labels_row = rownames(to_heatmap),
    treeheight_col = 1,
    fontsize_row = 6,
    fontsize_col = 8,
    gaps_col = c(2, 4, 6)
  )

Figure 2 C Fold change comparison

Pearson’s correlation of fold changes in individual genes identified as differentially regulated in any group. N = 15. Comparison between dose 2 and dose 3.

extract_gene_names <- function(x) {
  subsetted <- results_list_plot[[x]] %>%
    tibble::rownames_to_column("gene_symbol") %>%
    select(gene_symbol, log2FoldChange, padj)
  return(subsetted)
}

gene_names <- lapply(names(results_list_plot), extract_gene_names)
names(gene_names) <- names(results_list_plot)

gene_names_filt <- gene_names[c(
  "Naïve Dose 1 0-24 hours", "Naïve Dose 2 0-24 hours",
  "Naïve Dose 3 0-24 hours", "Naïve Dose 3 0-48 hours",
  "Conv Dose 1 0-24 hours", "Conv Dose 2 0-24 hours",
  "Conv Dose 3 0-24 hours", "Conv Dose 3 0-48 hours"
)]

gene_names_df_filt <- rbindlist(gene_names_filt, fill = TRUE, idcol = "group_timepoint") %>%
  mutate(
    timepoint = case_when(
      grepl("Dose 1", group_timepoint) ~ "Dose 1",
      grepl("Dose 2", group_timepoint) ~ "Dose 2",
      grepl("Dose 3 0-24", group_timepoint) ~ "Dose 3",
      grepl("Dose 3 0-48", group_timepoint) ~ "Dose 3 48h"
    ),
    group = case_when(
      grepl("Conv", group_timepoint) ~ "Conv",
      grepl("Naïve", group_timepoint) ~ "Naïve"
    )
  )

gene_names_sign_df <- gene_names_df_filt %>%
  select(-group_timepoint) %>%
  pivot_wider(names_from = group, values_from = c(padj, log2FoldChange), names_glue = "{.value}_{group}") %>%
  mutate(
    # Categorize genes into 'degs_presence' based on significance and regulation
    degs_presence = case_when(
      padj_Naïve < 0.05 & padj_Conv < 0.05 & (log2FoldChange_Conv * log2FoldChange_Naïve) > 0 ~ "Both significant",
      padj_Naïve < 0.05 ~ "Naïve only",
      padj_Conv < 0.05 ~ "Conv only",
      TRUE ~ "non_significant"
    ),
    regulation = case_when(
      log2FoldChange_Conv > 0 & log2FoldChange_Naïve > 0 ~ "Upregulated",
      log2FoldChange_Conv < 0 & log2FoldChange_Naïve < 0 ~ "Downregulated",
      log2FoldChange_Conv > 0 & log2FoldChange_Naïve < 0 ~ "Upregulated Conv",
      log2FoldChange_Conv < 0 & log2FoldChange_Naïve > 0 ~ "Upregulated Naïve"
    )
  )

filtered_gene_doses <- gene_names_sign_df %>%
  filter(!timepoint %in% c("Dose 1", "Dose 3 48h"))

g1 <- filtered_gene_doses %>%
  filter(degs_presence != "non_significant") %>%
  ggplot(aes(log2FoldChange_Naïve, log2FoldChange_Conv, color = degs_presence)) +
  geom_point(data = subset(filtered_gene_doses, degs_presence == "non_significant"), alpha = .3, color = "grey80") +
  geom_point(data = subset(filtered_gene_doses, degs_presence == "Naïve only"), alpha = .3) +
  geom_point(data = subset(filtered_gene_doses, degs_presence == "Conv only"), alpha = .3) +
  geom_point(data = subset(filtered_gene_doses, degs_presence == "Both significant"), alpha = .3) +
  scale_color_viridis_d() +
  scale_y_continuous(limits = c(-6, 6)) +
  scale_x_continuous(limits = c(-6, 6)) +
  geom_smooth(data = subset(filtered_gene_doses, degs_presence != "non_significant", se = TRUE), method = "lm") +
  scale_color_npg() +
  stat_cor(method = "pearson", cor.coef.name = "r", p.accuracy = 0.001) +
  facet_wrap(~timepoint) +
  theme_prism(base_line_size = .5, base_fontface = "plain") +
  theme(
    axis.ticks = element_line(size = .5), legend.position = "top",
    aspect.ratio = 1
  )

g1

Figure S4 B Fold change comparison

Fold changes in individual genes identified as significantly differentially regulated in any group, Dose 1 and Dose 3.

filtered_gene_doses <- gene_names_sign_df %>%
  filter(timepoint %in% c("Dose 1", "Dose 3 48h"))

g2 <- filtered_gene_doses %>%
  filter(degs_presence != "non_significant") %>%
  ggplot(aes(log2FoldChange_Naïve, log2FoldChange_Conv, color = degs_presence)) +
  geom_point(data = subset(filtered_gene_doses, degs_presence == "non_significant"), alpha = .3, color = "grey80") +
  geom_point(data = subset(filtered_gene_doses, degs_presence == "Naïve only"), alpha = .3) +
  geom_point(data = subset(filtered_gene_doses, degs_presence == "Conv only"), alpha = .3) +
  geom_point(data = subset(filtered_gene_doses, degs_presence == "Both significant"), alpha = .3) +
  scale_color_viridis_d() +
  scale_y_continuous(limits = c(-6, 6)) +
  scale_x_continuous(limits = c(-6, 6)) +
  geom_smooth(data = subset(filtered_gene_doses, degs_presence != "non_significant", se = TRUE), method = "lm") +
  scale_color_npg() +
  stat_cor(method = "pearson", cor.coef.name = "r", p.accuracy = 0.001) +
  facet_wrap(~timepoint) +
  theme_prism(base_line_size = .5, base_fontface = "plain") +
  theme(
    axis.ticks = element_line(size = .5), legend.position = "top",
    aspect.ratio = 1
  )

g2

Jaccard index of DEGs

The Jaccard index is a measure of overlap between two sets of genes, in this case the DEGs between the convalascent and naive groups. Calculated using the GeneOverlap package.

degs <- lapply(gene_names_filt, function(x) {
  x %>%
    filter(padj < 0.05, log2FoldChange > 1) %>%
    pull(gene_symbol)
})
degs <- degs[-c(1, 4, 8)]

gom_obj <- newGOM(degs, degs)

filtered_degs_matrix <- getMatrix(gom_obj, "Jaccard")

Heatmap Jaccard index

my_palette <- colorRampPalette(c("#265891", "#F6FF92", "#B80F20"))(n = 50)
# order matrix based on cluster
od <- hclust(dist(filtered_degs_matrix), method = "ward.D2")$order
filtered_degs_matrix_ordered <- filtered_degs_matrix[od, od]

# Define color values for different row annotations
row_col_group <- c("Naïve" = "#A9A9A9", "Conv" = "#A9CAF6")
row_col_time <- c(
  "Dose 1 0-24 hours" = "#5e4fa2",
  "Dose 2 0-24 hours" = "#fdae61",
  "Dose 3 0-24 hours" = "#9e0142",
  "Dose 3 0-48 hours" = "#abdda4"
)

# Create row annotations for heatmap
row_anno <- rowAnnotation(
  timepoint = gsub(".*? (.*)$", "\\1", rownames(filtered_degs_matrix_ordered)),
  group = gsub("^(.*?) .*", "\\1", rownames(filtered_degs_matrix_ordered)),
  col = list(timepoint = row_col_time, group = row_col_group),
  annotation_legend_param = list(
    timepoint = list(title = "Timepoint"),
    group = list(title = "Group")
  )
)

col_anno <- columnAnnotation(
  timepoint = gsub(".*? (.*)$", "\\1", rownames(filtered_degs_matrix_ordered)),
  group = gsub("^(.*?) .*", "\\1", rownames(filtered_degs_matrix_ordered)),
  col = list(timepoint = row_col_time, group = row_col_group),
  show_legend = FALSE
)

heatmap1 <- Heatmap(filtered_degs_matrix_ordered,
  border = FALSE,
  rect_gp = gpar(type = "none"),
  show_row_names = FALSE, show_column_names = FALSE,
  name = "Jaccard similarity index",
  bottom_annotation = col_anno,
  right_annotation = row_anno,
  col = my_palette,
  clustering_method_row = "ward.D2",
  clustering_method_columns = "ward.D2",
  show_column_dend = FALSE,
  cell_fun = function(j, i, x, y, w, h, fill) {
    if (as.numeric(x) <= 1.1 - as.numeric(y)) {
      grid.rect(x, y, w, h, gp = gpar(fill = fill, col = fill))
    }
  }
)
heatmap1

Figure 4 A Cytokine Heatmap

Collection of significant differentially expressed genes of immune-associated groups of receptor ligands and receptors annotated in the HUGO database (total of 157). The bubble size represents the mean of absolute fold change when significant for both groups but the value itself when significant for only one of the groups.

4 gene sets were used based on the groups of Chemokine (45), Interferon (33), Interleukins (43), and TNF (18) ligands according to the HUGO gene nomenclature Committee.

Cut-off; FDR-adjusted p-value < 0.05. N = 15.

ls_gene_file <- list.files("../data/gene_sets", pattern = "231102|231128", full.names = TRUE)
ls_gene_sets <- lapply(ls_gene_file, read.csv, header = TRUE, sep = ",")
names(ls_gene_sets) <- sub("^(?:[^_]+_){2}([^_]+)_.*$", "\\1", ls_gene_file)
gene_sets <- rbindlist(ls_gene_sets, idcol = "gene_set", use.names = TRUE)

cytokine_heatmap <- gene_names_sign_df %>%
  pivot_longer(
    cols = starts_with("padj_") | starts_with("log2FoldChange_"),
    names_to = c(".value", "group"),
    names_pattern = "([\\p{L}]+)_(\\w+)"
  ) %>%
  filter(gene_symbol %in% gene_sets$Approved_symbol) %>%
  mutate(
    group = ifelse(group == "Na", "Naïve", group),
    group_timepoint = paste(timepoint, group)
  ) %>%
  group_by(gene_symbol, timepoint) %>%
  mutate(FoldChange_mean = case_when(
    degs_presence == "Both significant" ~ mean(FoldChange),
    degs_presence == "non_significant" ~ min(FoldChange),
    degs_presence %in% c("Conv only", "Naïve only") ~ max(FoldChange)
  )) %>%
  ungroup()

g2 <- cytokine_heatmap %>%
  group_by(gene_symbol) %>%
  filter(
    !all(degs_presence == "non_significant"),
    regulation %in% c("Upregulated", "Downregulated")
  ) %>%
  ungroup() %>%
  ggplot(aes(x = timepoint, y = gene_symbol, size = abs(FoldChange_mean), color = degs_presence)) +
  geom_point() +
  scale_color_manual(values = c("#D95F02", "#A9CAF6", "#A9A9A9", "grey90")) +
  labs(x = "", y = "", size = "FoldChange\n(Absolute log2)", color = "Significance") +
  theme_bw() +
  facet_wrap(~regulation, scales = "free_y") +
  theme_prism(base_line_size = .5, base_fontface = "plain", axis_text_angle = 45, base_size = 12) +
  theme(
    axis.ticks = element_line(size = .5), legend.position = "right",
    aspect.ratio = 1,
    legend.title = element_text()
  )
g2

Generate examples of cytokine DEGs

# interesting genes related to complement
plot_counts <- function(gene_interest, dds = dds1) {
  fiss <- plotCounts(dds, gene_interest,
    intgroup = c("dose", "group"), returnData = TRUE, transform = FALSE, normalized = TRUE, replaced = FALSE
  )

  g <- ggplot(
    fiss,
    aes(x = dose, y = count, color = group, group = group)
  ) +
    geom_point() +
    stat_summary(fun.y = mean, geom = "line") +
    scale_y_log10() +
    labs(x = "", y = "Normalized counts\n(median of ratios)") +
    scale_color_manual(values = c("#A9A9A9", "#A9CAF6")) +
    theme_prism(base_line_size = .5, base_fontface = "plain", axis_text_angle = 45) +
    theme(
      axis.ticks = element_line(size = .5), legend.position = "right",
      aspect.ratio = 1,
      legend.title = element_text()
    )
  g

  ggsave(g,
    filename = paste0("../", result.dir, gene_interest, "_cytokine_degs.pdf"),
    height = 5, width = 5
  )
}
plot_counts(gene_interest = "CXCL10")

plot_counts(gene_interest = "IL16")

plot_counts(gene_interest = "CD40LG")

plot_counts(gene_interest = "CCL5")

plot_counts(gene_interest = "TNF")

plot_counts(gene_interest = "TNFSF13B")

plot_counts(gene_interest = "IL1B")

plot_counts(gene_interest = "IL15")

Over representation analysis of transcriptional factors

Databases used; KEGG Pathway Database (figure 3A) and the MSigDB subset for Transcriptional Factors Targets (figure 3B).

DEGs with log2(fold change) greater than 1. Cut-off for DEGs was FDR-adjusted p-value < 0.05. N = 15.

msig_geneset <- msigdbr(species = "Homo sapiens", category = "C3")
msig_geneset <- msig_geneset %>% filter(gs_subcat %in% c("TFT:GTRD", "TFT:TFT_Legacy"))
tft_geneset <- msig_geneset %>%
  dplyr::select(gs_name, gene_symbol) %>%
  dplyr::rename(term = gs_name, gene = gene_symbol)

annot <- ensembldb::select(EnsDb.Hsapiens.v86,
  keys = keys(EnsDb.Hsapiens.v86),
  columns = c("ENTREZID", "SYMBOL")
)

degs <- lapply(degs, function(x) plyr::mapvalues(x, annot$GENEID, annot$SYMBOL, warn_missing = FALSE))

## map names with TFs
ora_tf <- lapply(degs, function(x) {
  enricher(x,
    pvalueCutoff = 1,
    pAdjustMethod = "fdr",
    qvalueCutoff = 1,
    TERM2GENE = tft_geneset
  )
})

ora_tf_df <- lapply(ora_tf, data.frame)
ora_tf_df_bind <- rbindlist(ora_tf_df, use.names = TRUE, idcol = "group_timepoint", fill = TRUE) %>% as.data.frame()

Figure 3B MSigDB

ora_tf_df_bind <- ora_tf_df_bind %>%
  mutate(
    timepoint = case_when(
      grepl("Dose 1", group_timepoint) ~ "Dose 1",
      grepl("Dose 2", group_timepoint) ~ "Dose 2",
      grepl("Dose 3 0-24", group_timepoint) ~ "Dose 3",
      grepl("Dose 3 0-48", group_timepoint) ~ "Dose 3 48h"
    ),
    group = case_when(
      grepl("Conv", group_timepoint) ~ "Conv",
      grepl("Naïve", group_timepoint) ~ "Naïve"
    ),
    timepoint_group = paste(timepoint, group, sep = "_")
  )

# Create the ggplot with the reordered x-axis

ora_tf_df_bind <- ora_tf_df_bind %>%
  filter(p.adjust < 0.05 & Count > 1) %>%
  mutate(
    ID_cut = case_when(
      grepl("TARGET", ID) ~ sub("_.*$", "", ID),
      str_count(ID, "_") > 1 ~ sub("^[^_]*_(.*?)_.*$", "\\1", ID),
      TRUE ~ sub("_.*$", "", ID),
    ),
    ID_cut = ifelse(ID_cut == "ICSBP", "IRF8/ICSBP", ID_cut)
  ) %>%
  filter(!(ID_cut %in% c("YNGTTNNNATT", "NFKAPPAB65", "NFKB"))) %>%
  # Remove duplicated ID by selecting the gene set with lowest p_value
  arrange(ID, p.adjust) %>%
  distinct(ID, timepoint_group, .keep_all = TRUE)

# create empty data.frame with Naive Dose 1 for aesthetics
data_empty <- data.frame(
  timepoint_group = "Dose 1_Naïve",
  ID_cut = unique(ora_tf_df_bind$ID_cut),
  p.adjust = 1
)

ora_tf_df_bind <- plyr::rbind.fill(ora_tf_df_bind, data_empty)

g3 <- ora_tf_df_bind %>%
  ggplot(aes(x = timepoint_group, y = ID_cut, fill = -log10(p.adjust))) +
  geom_tile() +
  theme_bw() +
  scale_fill_gradient2(low = "#265891", mid = "white", high = "#B80F20") +
  labs(x = "", y = "", fill = "Adjusted p-value\n(-log10)") +
  theme_prism(base_line_size = .5, base_fontface = "plain", axis_text_angle = 45) +
  theme(
    axis.ticks = element_line(size = .5), legend.position = "right",
    aspect.ratio = 1,
    legend.title = element_text()
  )
g3

Figure 3A KEGG Pathway

degs[["Naïve Dose 1 0-24 hours"]] <- "no_degs"
degs_entrez_id <- lapply(degs, function(x) plyr::mapvalues(x, from = annot$SYMBOL, to = annot$ENTREZID, warn_missing = FALSE))

compared_cluster <- compareCluster(degs_entrez_id, fun = "enrichKEGG", organism = "hsa")
cluster_termsim <- pairwise_termsim(compared_cluster, method = "JC")

# create empty data.frame with Naive Dose 1 for aesthetics
data_empty <- data.frame(
  Cluster = "Naïve Dose 1 0-24 hours",
  ID = cluster_termsim@compareClusterResult$ID,
  Description = cluster_termsim@compareClusterResult$Description,
  log10_padj = 0,
  pvalue = 1,
  p.adjust = 1,
  qvalue = 1,
  geneID = 0,
  Count = 1, GeneRatio = 0, BgRatio = 0
) %>% distinct()
cluster_termsim@compareClusterResult <- cluster_termsim@compareClusterResult %>%
  mutate(log10_padj = -log10(p.adjust))

cluster_termsim@compareClusterResult <- plyr::rbind.fill(cluster_termsim@compareClusterResult, data_empty)

# Set factor levels
cluster_termsim@compareClusterResult$Cluster <- factor(cluster_termsim@compareClusterResult$Cluster, levels = c(
  "Conv Dose 1 0-24 hours", "Naïve Dose 1 0-24 hours",
  "Conv Dose 2 0-24 hours", "Naïve Dose 2 0-24 hours",
  "Conv Dose 3 0-24 hours", "Naïve Dose 3 0-24 hours"
))


treeplot(cluster_termsim,
  showCategory = 31,
  label_format = 30,
  offset.params = list(
    tiplab = 9,
    bar_tree = 23
  ),
  cluster.params = list(
    n = 6,
    label_words_n = 0
  ),
  clusterPanel.params = list(
    colnames_angle = -90
  ),
  color = "log10_padj"
) +
  labs(x = "", fill = "Adjusted p-value\n(-log10)") +
  scale_fill_gradient2(low = "#265891", mid = "white", high = "#B80F20") +
  theme(
    axis.ticks = element_line(size = .5), legend.position = "right",
    aspect.ratio = 1,
    legend.title = element_text()
  )

Vaccine detection in RNA-seq

tpm_gene <- data.table::fread("../data/processed_data_nfcore/batch_1-2/salmon/salmon.merged.gene_tpm.tsv", header = TRUE, sep = "\t") %>%
  as.data.frame()

colnames(tpm_gene) <- plyr::mapvalues(sub("_L001|_L002", "", colnames(tpm_gene)), from = sampleTable$sample_ID, to = paste(sampleTable$subject_ID, sampleTable$dose, sampleTable$group, sep = "_"))

names_col <- colnames(tpm_gene)[3:101]
vaccine_tpm <- tpm_gene[64622:64625, ]

vaccine_plot <- tidyr::pivot_longer(vaccine_tpm, cols = names_col) %>%
  tidyr::separate(name, into = c("subject_ID", "time", "dose", "group"), sep = "_", remove = TRUE) %>%
  mutate(
    time_dose = paste(time, dose, sep = "_"),
    time_dose = factor(time_dose, levels = c(
      "0_dose1", "24_dose1",
      "0_dose2", "24_dose2",
      "0_dose3", "24_dose3", "48_dose3"
    )),
    group = case_when(
      group == "pos" ~ "Conv",
      group == "neg" ~ "Naïve"
    ),
    group = factor(group, levels = c("Conv", "Naïve"))
  ) %>%
  na.omit()

vaccine_plot %>%
  filter(grepl("pfizer", gene_id)) %>%
  ggplot(aes(x = time_dose, y = value, group = subject_ID, fill = group)) +
  geom_line() +
  geom_point(shape = 21, size = 5, color = "black", stroke = 1.2) +
  scale_fill_manual(values = c("#A9CAF6", "#A9A9A9")) +
  scale_y_log10(limits = c(0.001, 1000), breaks = c(0.01, 0.1, 1, 10, 100)) +
  labs(y = "Transcripts per million (log)", x = "") +
  ggprism::theme_prism() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
  ) +
  facet_wrap(~group, ncol = 4)